1. Basic Data inspection¶

In [14]:
# importing all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.subplots as pp
import seaborn as sns
from wordcloud import WordCloud
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, SVC
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputClassifier
import json
import pickle
import bz2file as bz2

Loading dataset¶

In [15]:
df = pd.read_csv("Airbnb_Data.csv")
pd.set_option('display.max_columns', 100) # To view all columns
df.head()
Out[15]:
id log_price property_type room_type amenities accommodates bathrooms bed_type cancellation_policy cleaning_fee city description first_review host_has_profile_pic host_identity_verified host_response_rate host_since instant_bookable last_review latitude longitude name neighbourhood number_of_reviews review_scores_rating thumbnail_url zipcode bedrooms beds
0 6901257 5.010635 Apartment Entire home/apt {"Wireless Internet","Air conditioning",Kitche... 3 1.0 Real Bed strict True NYC Beautiful, sunlit brownstone 1-bedroom in the ... 2016-06-18 t t NaN 2012-03-26 f 2016-07-18 40.696524 -73.991617 Beautiful brownstone 1-bedroom Brooklyn Heights 2 100.0 https://a0.muscache.com/im/pictures/6d7cbbf7-c... 11201 1.0 1.0
1 6304928 5.129899 Apartment Entire home/apt {"Wireless Internet","Air conditioning",Kitche... 7 1.0 Real Bed strict True NYC Enjoy travelling during your stay in Manhattan... 2017-08-05 t f 100% 2017-06-19 t 2017-09-23 40.766115 -73.989040 Superb 3BR Apt Located Near Times Square Hell's Kitchen 6 93.0 https://a0.muscache.com/im/pictures/348a55fe-4... 10019 3.0 3.0
2 7919400 4.976734 Apartment Entire home/apt {TV,"Cable TV","Wireless Internet","Air condit... 5 1.0 Real Bed moderate True NYC The Oasis comes complete with a full backyard ... 2017-04-30 t t 100% 2016-10-25 t 2017-09-14 40.808110 -73.943756 The Garden Oasis Harlem 10 92.0 https://a0.muscache.com/im/pictures/6fae5362-9... 10027 1.0 3.0
3 13418779 6.620073 House Entire home/apt {TV,"Cable TV",Internet,"Wireless Internet",Ki... 4 1.0 Real Bed flexible True SF This light-filled home-away-from-home is super... NaN t t NaN 2015-04-19 f NaN 37.772004 -122.431619 Beautiful Flat in the Heart of SF! Lower Haight 0 NaN https://a0.muscache.com/im/pictures/72208dad-9... 94117.0 2.0 2.0
4 3808709 4.744932 Apartment Entire home/apt {TV,Internet,"Wireless Internet","Air conditio... 2 1.0 Real Bed moderate True DC Cool, cozy, and comfortable studio located in ... 2015-05-12 t t 100% 2015-03-01 t 2017-01-22 38.925627 -77.034596 Great studio in midtown DC Columbia Heights 4 40.0 NaN 20009 0.0 1.0

Shape and details of dataset¶

In [16]:
print('Rows: ',df.shape[0])
print('Columns: ',df.shape[1])
Rows:  74111
Columns:  29
In [17]:
print('Dataframe details: \n')
df.info(verbose=True)
Dataframe details: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   log_price               74111 non-null  float64
 2   property_type           74111 non-null  object 
 3   room_type               74111 non-null  object 
 4   amenities               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  object 
 13  host_has_profile_pic    73923 non-null  object 
 14  host_identity_verified  73923 non-null  object 
 15  host_response_rate      55812 non-null  object 
 16  host_since              73923 non-null  object 
 17  instant_bookable        74111 non-null  object 
 18  last_review             58284 non-null  object 
 19  latitude                74111 non-null  float64
 20  longitude               74111 non-null  float64
 21  name                    74111 non-null  object 
 22  neighbourhood           67239 non-null  object 
 23  number_of_reviews       74111 non-null  int64  
 24  review_scores_rating    57389 non-null  float64
 25  thumbnail_url           65895 non-null  object 
 26  zipcode                 73145 non-null  object 
 27  bedrooms                74020 non-null  float64
 28  beds                    73980 non-null  float64
dtypes: bool(1), float64(7), int64(3), object(18)
memory usage: 15.9+ MB
In [18]:
df.columns
Out[18]:
Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')

Basic statistical summary of the numerical columns¶

In [19]:
df.describe()
Out[19]:
id log_price accommodates bathrooms latitude longitude number_of_reviews review_scores_rating bedrooms beds
count 7.411100e+04 74111.000000 74111.000000 73911.000000 74111.000000 74111.000000 74111.000000 57389.000000 74020.000000 73980.000000
mean 1.126662e+07 4.782069 3.155146 1.235263 38.445958 -92.397525 20.900568 94.067365 1.265793 1.710868
std 6.081735e+06 0.717394 2.153589 0.582044 3.080167 21.705322 37.828641 7.836556 0.852143 1.254142
min 3.440000e+02 0.000000 1.000000 0.000000 33.338905 -122.511500 0.000000 20.000000 0.000000 0.000000
25% 6.261964e+06 4.317488 2.000000 1.000000 34.127908 -118.342374 1.000000 92.000000 1.000000 1.000000
50% 1.225415e+07 4.709530 2.000000 1.000000 40.662138 -76.996965 6.000000 96.000000 1.000000 1.000000
75% 1.640226e+07 5.220356 4.000000 1.000000 40.746096 -73.954660 23.000000 100.000000 1.000000 2.000000
max 2.123090e+07 7.600402 16.000000 8.000000 42.390437 -70.985047 605.000000 100.000000 10.000000 18.000000

Null Values¶

In [20]:
print('\nNull values in dataset:\n')
df.isnull().sum().sort_values(ascending=False)
Null values in dataset:

Out[20]:
host_response_rate        18299
review_scores_rating      16722
first_review              15864
last_review               15827
thumbnail_url              8216
neighbourhood              6872
zipcode                     966
bathrooms                   200
host_identity_verified      188
host_since                  188
host_has_profile_pic        188
beds                        131
bedrooms                     91
description                   0
name                          0
property_type                 0
room_type                     0
amenities                     0
number_of_reviews             0
accommodates                  0
longitude                     0
city                          0
latitude                      0
bed_type                      0
instant_bookable              0
cancellation_policy           0
cleaning_fee                  0
log_price                     0
id                            0
dtype: int64

2. Visualization¶

Converting boolean and log values¶

In [21]:
# Replacing columns with f/t and boolean with 0/1
df = df.infer_objects(copy=False).replace({'f': 0, 't': 1})
df = df.infer_objects(copy=False).replace({False: 0, True: 1})

df['price'] = round(np.exp(df['log_price']), 2)
C:\Users\robbi\AppData\Local\Temp\ipykernel_220\600328113.py:2: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df = df.infer_objects(copy=False).replace({'f': 0, 't': 1})
C:\Users\robbi\AppData\Local\Temp\ipykernel_220\600328113.py:3: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df = df.infer_objects(copy=False).replace({False: 0, True: 1})

Calculating price per person¶

In [22]:
df['price_per_person'] = df['price']/df['accommodates']

Histogram for all numerical data¶

In [23]:
# Plotting the distribution of numerical features
numerical_features = ['accommodates', 'bathrooms', 'beds', 'bedrooms', 'review_scores_rating', 'number_of_reviews', 'host_response_rate']
df[numerical_features].hist(figsize=(20,20));
No description has been provided for this image

Box plots¶

In [24]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=df[numerical_features])
plt.xlabel('Features')
plt.ylabel('Values')
plt.title('Box Plot of Numerical Features')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image

Price Distribution Histogram¶

In [25]:
plt.figure(figsize=(10, 6))
plt.hist(df['price'], bins=30, color='skyblue', edgecolor='black', alpha=.8, label='Price')
plt.hist(df['price_per_person'], bins=30, color='orange', edgecolor='red', alpha=.5, label='Price Per Person')
plt.title('Price Distribution Histogram')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
# plt.savefig("price.png", dpi=1000)
plt.show()
No description has been provided for this image

Price by room_type¶

In [26]:
avg_price_by_room_type = df.groupby('room_type')['price'].mean().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
bars = avg_price_by_room_type.plot(kind='bar', color='lightgreen')
for i, v in enumerate(avg_price_by_room_type):
    plt.text(i, v + 1, str(round(v, 2)), ha='center', va='bottom', fontsize=7)
plt.title('Average Price by Room Type')
plt.xlabel('Room Type')
plt.ylabel('Average Price')
plt.xticks(rotation=45)
plt.grid(axis='y')
# plt.savefig("roomType_price.png", dpi=1000)
plt.show()
No description has been provided for this image

Instant Bookable Property Percentage¶

In [27]:
plt.figure(figsize=(8, 6))
instant_bookable_counts = df['instant_bookable'].value_counts()
labels = np.where(instant_bookable_counts.index == 1, 'Yes', 'No')
instant_bookable_counts.plot(kind='pie', autopct='%1.1f%%', colors=['lightcoral', 'lightblue'], labels=labels)
plt.title('Instant Bookable Property Percentage')
plt.ylabel('')
plt.show()
No description has been provided for this image

Price Trends Over Time as 'first_review' is in datetime format¶

In [28]:
df['first_review'] = pd.to_datetime(df['first_review'])
df['first_review'].dt.year
Out[28]:
0        2016.0
1        2017.0
2        2017.0
3           NaN
4        2015.0
          ...  
74106       NaN
74107    2016.0
74108    2015.0
74109       NaN
74110    2013.0
Name: first_review, Length: 74111, dtype: float64
In [29]:
df['year'] = df['first_review'].dt.year
avg_price_by_year = df.dropna(subset=['first_review']).groupby('year')['price'].mean()
avg_price_per_person_by_year = df.dropna(subset=['first_review']).groupby('year')['price_per_person'].mean()
plt.figure(figsize=(10, 6))
avg_price_by_year.plot(marker='o', color='orange', label='Total Price')
avg_price_per_person_by_year.plot(marker='o', color='green', label='Price per person')
plt.title('Average Price Trends Over Time')
plt.xlabel('Year')
plt.ylabel('Average Price')
plt.grid(True)
plt.xticks(avg_price_by_year.index)
plt.legend(loc="upper right")
# plt.savefig("price_trend.png", dpi=1000)
plt.show()
No description has been provided for this image

Average price by City¶

In [30]:
plt.figure(figsize=(10, 6))
avg_price_by_city = df.groupby('city')['price'].mean().sort_values(ascending=False)
bars = avg_price_by_city.plot(kind='bar', color='lightcoral')
for i, v in enumerate(avg_price_by_city):
    plt.text(i, v + 1, str(round(v, 2)), ha='center', va='bottom', fontsize=7)
plt.title('Average Price by City')
plt.xlabel('City')
plt.ylabel('Average Price')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()
No description has been provided for this image

Distrubition of price on map¶

In [31]:
color_scale = [(0, 'green'), (1,'red')]

fig = px.scatter_mapbox(df,
                        lat="latitude",
                        lon="longitude", 
                        hover_name="neighbourhood", 
                        hover_data=["neighbourhood", "accommodates"],
                        color="price",
                        color_continuous_scale=color_scale,
                        # size="price",
                        zoom=3, 
                        height=800,
                        width=800)

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

Amenities¶

In [32]:
amenities_list = df['amenities'].str.replace('{', '').str.replace('}', ',').str.replace('"', '')
df['amenities_clean'] = amenities_list
amenities_list
Out[32]:
0        Wireless Internet,Air conditioning,Kitchen,Hea...
1        Wireless Internet,Air conditioning,Kitchen,Hea...
2        TV,Cable TV,Wireless Internet,Air conditioning...
3        TV,Cable TV,Internet,Wireless Internet,Kitchen...
4        TV,Internet,Wireless Internet,Air conditioning...
                               ...                        
74106                                                    ,
74107    TV,Cable TV,Internet,Wireless Internet,Kitchen...
74108    TV,Internet,Wireless Internet,Air conditioning...
74109    TV,Wireless Internet,Air conditioning,Kitchen,...
74110    TV,Internet,Wireless Internet,Kitchen,Free par...
Name: amenities, Length: 74111, dtype: object
In [33]:
# amenities_list = list(df.amenities)
amenities_list_all = " ".join(amenities_list)

amenities_list_all = [amenity.strip().lower()
                  .replace("(", "")
                  .replace(")", "")
                  .replace("_", " ")
                  .replace(" & ", " and ")
                  .replace("-", " ")
                  for amenity in amenities_list_all.split(',')
                  if 'translation missing: ' not in amenity]
amenities_set = set(amenities_list_all)
len(amenities_set), amenities_set
Out[33]:
(128,
 {'',
  '24 hour check in',
  'accessible height bed',
  'accessible height toilet',
  'air conditioning',
  'air purifier',
  'baby bath',
  'baby monitor',
  'babysitter recommendations',
  'bath towel',
  'bathtub',
  'bathtub with shower chair',
  'bbq grill',
  'beach essentials',
  'beachfront',
  'bed linens',
  'body soap',
  'breakfast',
  'buzzer/wireless intercom',
  'cable tv',
  'carbon monoxide detector',
  'cats',
  'changing table',
  'children’s books and toys',
  'children’s dinnerware',
  'cleaning before checkout',
  'coffee maker',
  'cooking basics',
  'crib',
  'disabled parking spot',
  'dishes and silverware',
  'dishwasher',
  'dogs',
  'doorman',
  'doorman entry',
  'dryer',
  'elevator',
  'elevator in building',
  'essentials',
  'ethernet connection',
  'ev charger',
  'extra pillows and blankets',
  'family/kid friendly',
  'fire extinguisher',
  'fireplace guards',
  'firm matress',
  'firm mattress',
  'first aid kit',
  'fixed grab bars for shower and toilet',
  'flat',
  'flat smooth pathway to front door',
  'free parking on premises',
  'free parking on street',
  'game console',
  'garden or backyard',
  'grab rails for shower and toilet',
  'ground floor access',
  'gym',
  'hair dryer',
  'hand or paper towel',
  'hand soap',
  'handheld shower head',
  'hangers',
  'heating',
  'high chair',
  'host greets you',
  'hot tub',
  'hot water',
  'hot water kettle',
  'indoor fireplace',
  'internet',
  'iron',
  'keypad',
  'kitchen',
  'lake access',
  'laptop friendly workspace',
  'lock on bedroom door',
  'lockbox',
  'long term stays allowed',
  'luggage dropoff allowed',
  'microwave',
  'other',
  'other pets',
  'outlet covers',
  'oven',
  'pack ’n play/travel crib',
  'paid parking off premises',
  'path to entrance lit at night',
  'patio or balcony',
  'pets allowed',
  'pets live on this property',
  'pocket wifi',
  'pool',
  'private bathroom',
  'private entrance',
  'private living room',
  'refrigerator',
  'roll in shower with chair',
  'room darkening shades',
  'safety card',
  'self check in',
  'shampoo',
  'single level home',
  'ski in/ski out',
  'smart lock',
  'smartlock',
  'smoke detector',
  'smoking allowed',
  'smooth pathway to front door',
  'stair gates',
  'step free access',
  'stove',
  'suitable for events',
  'table corner guards',
  'toilet paper',
  'tv',
  'washer',
  'washer / dryer',
  'waterfront',
  'well lit path to entrance',
  'wheelchair accessible',
  'wide clearance to bed',
  'wide clearance to shower and toilet',
  'wide doorway',
  'wide entryway',
  'wide hallway clearance',
  'window guards',
  'wireless internet'})
In [34]:
from collections import Counter

word_count_dict = Counter(amenities_list_all)
wordcloud = WordCloud(width=1000, height=500, background_color='white').generate_from_frequencies(word_count_dict)

plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='nearest')
plt.axis("off")
# plt.savefig("wordcloud.png", bbox_inches="tight")
plt.show()
No description has been provided for this image

3. Data Preprocessing¶

In [35]:
df['host_response_rate'] = df['host_response_rate'].str.replace('%', '').astype('float')
df.head()
Out[35]:
id log_price property_type room_type amenities accommodates bathrooms bed_type cancellation_policy cleaning_fee city description first_review host_has_profile_pic host_identity_verified host_response_rate host_since instant_bookable last_review latitude longitude name neighbourhood number_of_reviews review_scores_rating thumbnail_url zipcode bedrooms beds price price_per_person year amenities_clean
0 6901257 5.010635 Apartment Entire home/apt {"Wireless Internet","Air conditioning",Kitche... 3 1.0 Real Bed strict 1 NYC Beautiful, sunlit brownstone 1-bedroom in the ... 2016-06-18 1.0 1.0 NaN 2012-03-26 0 2016-07-18 40.696524 -73.991617 Beautiful brownstone 1-bedroom Brooklyn Heights 2 100.0 https://a0.muscache.com/im/pictures/6d7cbbf7-c... 11201 1.0 1.0 150.0 50.000000 2016.0 Wireless Internet,Air conditioning,Kitchen,Hea...
1 6304928 5.129899 Apartment Entire home/apt {"Wireless Internet","Air conditioning",Kitche... 7 1.0 Real Bed strict 1 NYC Enjoy travelling during your stay in Manhattan... 2017-08-05 1.0 0.0 100.0 2017-06-19 1 2017-09-23 40.766115 -73.989040 Superb 3BR Apt Located Near Times Square Hell's Kitchen 6 93.0 https://a0.muscache.com/im/pictures/348a55fe-4... 10019 3.0 3.0 169.0 24.142857 2017.0 Wireless Internet,Air conditioning,Kitchen,Hea...
2 7919400 4.976734 Apartment Entire home/apt {TV,"Cable TV","Wireless Internet","Air condit... 5 1.0 Real Bed moderate 1 NYC The Oasis comes complete with a full backyard ... 2017-04-30 1.0 1.0 100.0 2016-10-25 1 2017-09-14 40.808110 -73.943756 The Garden Oasis Harlem 10 92.0 https://a0.muscache.com/im/pictures/6fae5362-9... 10027 1.0 3.0 145.0 29.000000 2017.0 TV,Cable TV,Wireless Internet,Air conditioning...
3 13418779 6.620073 House Entire home/apt {TV,"Cable TV",Internet,"Wireless Internet",Ki... 4 1.0 Real Bed flexible 1 SF This light-filled home-away-from-home is super... NaT 1.0 1.0 NaN 2015-04-19 0 NaN 37.772004 -122.431619 Beautiful Flat in the Heart of SF! Lower Haight 0 NaN https://a0.muscache.com/im/pictures/72208dad-9... 94117.0 2.0 2.0 750.0 187.500000 NaN TV,Cable TV,Internet,Wireless Internet,Kitchen...
4 3808709 4.744932 Apartment Entire home/apt {TV,Internet,"Wireless Internet","Air conditio... 2 1.0 Real Bed moderate 1 DC Cool, cozy, and comfortable studio located in ... 2015-05-12 1.0 1.0 100.0 2015-03-01 1 2017-01-22 38.925627 -77.034596 Great studio in midtown DC Columbia Heights 4 40.0 NaN 20009 0.0 1.0 115.0 57.500000 2015.0 TV,Internet,Wireless Internet,Air conditioning...
In [36]:
df['first_review'] = pd.to_datetime(df['first_review'])
df['last_review'] = pd.to_datetime(df['last_review'])
df['host_since'] = pd.to_datetime(df['host_since'])

missing_start = df[df['host_since'].isnull() & df['first_review'].notnull()]
df.loc[df['host_since'].isnull() & df['first_review'].notnull(), ['host_since']] = missing_start['first_review']

df[df['host_since'].isnull() & df['last_review'].notnull()]
df.loc[df['host_since'].isnull() & df['last_review'].notnull(), ['host_since']] = missing_start['last_review']

columns_to_handle = {
    "host_response_rate": "median",
    "review_scores_rating": "median",
    "bathrooms": "mean",
    "beds": "mean",
    "bedrooms": "mean",
    "first_review": "min",
    "last_review": "max",
    "thumbnail_url": "unknown",
    "neighbourhood": "mode",
    "zipcode": "mode",
    "host_identity_verified": "mode",
    "host_has_profile_pic": "mode",
    "host_since": "min"
}

# Group data by 'accommodates' and calculate mean for each group
mean_bedrooms = df.groupby('accommodates')['bedrooms'].mean()
mean_bathrooms = df.groupby('accommodates')['bathrooms'].mean()
mean_beds = df.groupby('accommodates')['beds'].mean()

for column, strategy in columns_to_handle.items():
    if strategy == "mean":
        mean = df.groupby('accommodates')[column].mean()
        df[column] = df[column].fillna(df['accommodates'].map(mean))
    if strategy == "median":
        df[column] = df[column].fillna(df[column].median())
    elif strategy == "mode":
        df[column] = df[column].fillna(df[column].mode()[0])
    elif strategy == "min":
        df[column] = df[column].fillna(df[column].min())
    elif strategy == "max":
        df[column] = df[column].fillna(df[column].max())
    elif strategy == "unknown":
        df[column] = df[column].fillna("unknown")
df.isnull().sum()
Out[36]:
id                            0
log_price                     0
property_type                 0
room_type                     0
amenities                     0
accommodates                  0
bathrooms                     0
bed_type                      0
cancellation_policy           0
cleaning_fee                  0
city                          0
description                   0
first_review                  0
host_has_profile_pic          0
host_identity_verified        0
host_response_rate            0
host_since                    0
instant_bookable              0
last_review                   0
latitude                      0
longitude                     0
name                          0
neighbourhood                 0
number_of_reviews             0
review_scores_rating          0
thumbnail_url                 0
zipcode                       0
bedrooms                      0
beds                          0
price                         0
price_per_person              0
year                      15864
amenities_clean               0
dtype: int64
In [37]:
df1 = df.drop(["id", "name", "description", "first_review", "host_since", "last_review", "neighbourhood", "thumbnail_url", "zipcode"], axis = 1)
ordinal_encoder = OrdinalEncoder(categories=[['Shared room', 'Private room', 'Entire home/apt'],
                                             ['Airbed', 'Couch', 'Pull-out Sofa', 'Futon', 'Real Bed'],
                                             ['flexible', 'moderate', 'strict', 'super_strict_30', 'super_strict_60']]
                                )
df1[['room_type', 'bed_type', 'cancellation_policy']] = ordinal_encoder.fit_transform(df[['room_type', 'bed_type', 'cancellation_policy']])
label_df = df1.copy().drop(['amenities_clean', 'amenities'], axis = 1)
categorical_col = []
for column in label_df.columns:
    if label_df[column].dtypes != "float64" and label_df[column].dtypes != "int64":
        categorical_col.append(column)
le = LabelEncoder()
for col in categorical_col:
    label_df[col] = le.fit_transform(label_df[col])

correlation_matrix = label_df.corr()
plt.figure(figsize = (20,20))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
plt.show()
No description has been provided for this image
In [38]:
# Perform one-hot encoding for nominal columns
one_hot_encoder = OneHotEncoder(drop=None, sparse_output=False)
# one_hot_encoder.fit(df[['property_type', 'city']])
encoded_features = one_hot_encoder.fit_transform(df[['property_type', 'city']])
encoded_df = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(['property_type', 'city']))
encoded_df.head()

df1 = pd.concat([df1.drop(['property_type', 'city'], axis=1), encoded_df], axis=1)
one_hot_encoder.categories_
Out[38]:
[array(['Apartment', 'Bed & Breakfast', 'Boat', 'Boutique hotel',
        'Bungalow', 'Cabin', 'Camper/RV', 'Casa particular', 'Castle',
        'Cave', 'Chalet', 'Condominium', 'Dorm', 'Earth House',
        'Guest suite', 'Guesthouse', 'Hostel', 'House', 'Hut', 'In-law',
        'Island', 'Lighthouse', 'Loft', 'Other', 'Parking Space',
        'Serviced apartment', 'Tent', 'Timeshare', 'Tipi', 'Townhouse',
        'Train', 'Treehouse', 'Vacation home', 'Villa', 'Yurt'],
       dtype=object),
 array(['Boston', 'Chicago', 'DC', 'LA', 'NYC', 'SF'], dtype=object)]
In [39]:
correlation_with_target = correlation_matrix['log_price']
sorted_correlation = correlation_with_target.abs().sort_values(ascending=False)
sorted_correlation
Out[39]:
log_price                 1.000000
price                     0.840001
room_type                 0.607125
accommodates              0.567574
price_per_person          0.516145
bedrooms                  0.473084
beds                      0.442123
bathrooms                 0.355397
cancellation_policy       0.131869
cleaning_fee              0.111191
bed_type                  0.088367
review_scores_rating      0.084180
year                      0.080714
property_type             0.048741
longitude                 0.047529
instant_bookable          0.044271
number_of_reviews         0.032470
city                      0.030913
host_identity_verified    0.024014
host_has_profile_pic      0.013171
latitude                  0.002193
host_response_rate        0.001423
Name: log_price, dtype: float64
In [40]:
categorical_features = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city',
                       'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
target_variable = 'log_price'

# Chi-square test for independence
print("Chi-square test for independence:")
for feature in categorical_features:
    contingency_table = pd.crosstab(df[feature], df[target_variable])
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    if p_value < 0.05:
        print(f"{feature}: Useful (p-value={p_value})")
    else:
        print(f"{feature}: Not Useful (p-value={p_value})")
Chi-square test for independence:
property_type: Useful (p-value=0.0)
room_type: Useful (p-value=0.0)
bed_type: Not Useful (p-value=0.07518663315848466)
cancellation_policy: Useful (p-value=0.0)
cleaning_fee: Useful (p-value=0.0)
city: Useful (p-value=0.0)
host_has_profile_pic: Useful (p-value=1.331633061779495e-41)
host_identity_verified: Useful (p-value=2.2559278540162352e-55)
instant_bookable: Useful (p-value=1.3414375212905722e-40)

Scaling¶

Standard scaler¶

In [41]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_features])
df1[numerical_features] = scaled_features
plt.figure(figsize=(10, 6))
sns.boxplot(data=scaled_features)
plt.xlabel('Features')
plt.ylabel('Values')
plt.title('Box Plot of Numerical Features after scaling')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image

Model¶

In [42]:
p_type_count = df.groupby('property_type').id.count()
p_type_count.sort_values()
Out[42]:
property_type
Parking Space             1
Lighthouse                1
Island                    1
Casa particular           1
Train                     2
Cave                      2
Tipi                      3
Earth House               4
Chalet                    6
Treehouse                 7
Hut                       8
Yurt                      9
Vacation home            11
Castle                   13
Tent                     18
Serviced apartment       21
Boat                     65
Boutique hotel           69
Hostel                   70
In-law                   71
Cabin                    72
Timeshare                77
Camper/RV                94
Guest suite             123
Dorm                    142
Villa                   179
Bungalow                366
Bed & Breakfast         462
Guesthouse              498
Other                   607
Loft                   1244
Townhouse              1692
Condominium            2658
House                 16511
Apartment             49003
Name: id, dtype: int64
In [43]:
few_cats = p_type_count[p_type_count < 5].index.tolist()
cleaned_df = df.drop(df[df['property_type'].isin(few_cats)].index)
cleaned_df[cleaned_df['property_type'].isin(few_cats)]
Out[43]:
id log_price property_type room_type amenities accommodates bathrooms bed_type cancellation_policy cleaning_fee city description first_review host_has_profile_pic host_identity_verified host_response_rate host_since instant_bookable last_review latitude longitude name neighbourhood number_of_reviews review_scores_rating thumbnail_url zipcode bedrooms beds price price_per_person year amenities_clean
In [63]:
features = ['accommodates','bedrooms','beds','bathrooms',
             'property_type', 'room_type', 'cancellation_policy', 'cleaning_fee', 'city',
             'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
target = 'log_price'

ordinal_encoder = OrdinalEncoder(categories=[['Shared room', 'Private room', 'Entire home/apt'],
                                             ['flexible', 'moderate', 'strict', 'super_strict_30', 'super_strict_60']])

preprocessor = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop=None, sparse_output=False), ['property_type', 'city']),
        ('ord', ordinal_encoder, ['room_type', 'cancellation_policy']),
        ('scaler', StandardScaler(), ['accommodates','bedrooms','beds','bathrooms'])
    ],
    remainder='passthrough')
linear_model = LinearRegression()
linear_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', linear_model)])

# Split data into train and test sets
X = cleaned_df[features]
y = cleaned_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline and evaluate
linear_pipeline.fit(X, y)
y_pred = linear_pipeline.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_linear)
r2_linear = r2_score(y_test, y_pred)
print("R2 Score: ", r2_linear)
Mean Squared Error: 0.2260184139932397
R2 Score:  0.5554416423023205
In [64]:
indices = range(len(y_test))
plt.figure(figsize=(20, 6))
plt.plot(indices, y_test, label='Actual', color='blue')
plt.plot(indices, y_pred, label='Predicted', color='red')
plt.xlabel('Index')
plt.ylabel('Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.show()
No description has been provided for this image
In [65]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', rf_model)])

# Fit the pipeline and evaluate
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_rf)

r2_rf = r2_score(y_test, y_pred)
print("R2 Score: ", r2_rf)
Mean Squared Error: 0.22805464552677088
R2 Score:  0.5514365538210523
In [66]:
encoded_feature_names = rf_pipeline.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(['property_type', 'city'])
other_features = ['accommodates', 'bedrooms', 'beds', 'bathrooms', 'room_type', 'cancellation_policy', 'cleaning_fee',
                  'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
feature_names = list(encoded_feature_names) + other_features
feature_importances = pd.Series(rf_pipeline.named_steps['regressor'].feature_importances_, index=feature_names)

sorted_feature_importances = feature_importances.sort_values()
plt.figure(figsize=(10,10))
sns.barplot(x=sorted_feature_importances.values, y=sorted_feature_importances.index, hue=sorted_feature_importances.index, orient="h", palette="Blues_d", legend=False)
plt.title("Top Feature Importances")
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.show()
No description has been provided for this image
In [67]:
svr_model = SVR(kernel='rbf')
svr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', svr_model)])

# Fit the pipeline and evaluate
svr_pipeline.fit(X_train, y_train)
y_pred = svr_pipeline.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_svr)

r2_svr = r2_score(y_test, y_pred)
print("R2 Score: ", r2_svr)
Mean Squared Error: 0.2062936099783641
R2 Score:  0.5942385983725634

RFE¶

In [68]:
rfe = RFE(estimator=linear_model, n_features_to_select=10)
rfe_pipeline = Pipeline(steps=[('preprocesser', preprocessor), ('rfe', rfe), ('model', linear_model)])
rfe_pipeline.fit(X, y)
Out[68]:
Pipeline(steps=[('preprocesser',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encoder',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['property_type', 'city']),
                                                 ('ord',
                                                  OrdinalEncoder(categories=[['Shared '
                                                                              'room',
                                                                              'Private '
                                                                              'room',
                                                                              'Entire '
                                                                              'home/apt'],
                                                                             ['flexible',
                                                                              'moderate',
                                                                              'strict',
                                                                              'super_strict_30',
                                                                              'super_strict_60']]),
                                                  ['room_type',
                                                   'cancellation_policy']),
                                                 ('scaler', StandardScaler(),
                                                  ['accommodates', 'bedrooms',
                                                   'beds', 'bathrooms'])])),
                ('rfe',
                 RFE(estimator=LinearRegression(), n_features_to_select=10)),
                ('model', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocesser',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encoder',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['property_type', 'city']),
                                                 ('ord',
                                                  OrdinalEncoder(categories=[['Shared '
                                                                              'room',
                                                                              'Private '
                                                                              'room',
                                                                              'Entire '
                                                                              'home/apt'],
                                                                             ['flexible',
                                                                              'moderate',
                                                                              'strict',
                                                                              'super_strict_30',
                                                                              'super_strict_60']]),
                                                  ['room_type',
                                                   'cancellation_policy']),
                                                 ('scaler', StandardScaler(),
                                                  ['accommodates', 'bedrooms',
                                                   'beds', 'bathrooms'])])),
                ('rfe',
                 RFE(estimator=LinearRegression(), n_features_to_select=10)),
                ('model', LinearRegression())])
ColumnTransformer(remainder='passthrough',
                  transformers=[('encoder', OneHotEncoder(sparse_output=False),
                                 ['property_type', 'city']),
                                ('ord',
                                 OrdinalEncoder(categories=[['Shared room',
                                                             'Private room',
                                                             'Entire home/apt'],
                                                            ['flexible',
                                                             'moderate',
                                                             'strict',
                                                             'super_strict_30',
                                                             'super_strict_60']]),
                                 ['room_type', 'cancellation_policy']),
                                ('scaler', StandardScaler(),
                                 ['accommodates', 'bedrooms', 'beds',
                                  'bathrooms'])])
['property_type', 'city']
OneHotEncoder(sparse_output=False)
['room_type', 'cancellation_policy']
OrdinalEncoder(categories=[['Shared room', 'Private room', 'Entire home/apt'],
                           ['flexible', 'moderate', 'strict', 'super_strict_30',
                            'super_strict_60']])
['accommodates', 'bedrooms', 'beds', 'bathrooms']
StandardScaler()
['cleaning_fee', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
passthrough
RFE(estimator=LinearRegression(), n_features_to_select=10)
LinearRegression()
LinearRegression()
LinearRegression()
In [69]:
feature_rankings =  rfe_pipeline.named_steps['rfe'].ranking_
plt.figure(figsize=(10, 6))
plt.title("RFE Feature Ranking")
plt.xlabel("Number of features selected")
plt.ylabel("Feature ranking")
plt.plot(range(1, len(feature_rankings) + 1), feature_rankings)
plt.show()
No description has been provided for this image
In [70]:
selected_features = [feature_names[i] for i, support in enumerate(rfe_pipeline.named_steps['rfe'].support_) if support]
print("Selected features:", selected_features)
Selected features: ['property_type_Apartment', 'property_type_Bed & Breakfast', 'property_type_Cabin', 'property_type_Camper/RV', 'property_type_Dorm', 'property_type_Guest suite', 'property_type_Guesthouse', 'property_type_Hostel', 'property_type_Hut', 'property_type_Tent']

Principal Componenet Analysis¶

In [71]:
pca_pipeline = Pipeline([('preprocessor', preprocessor), ('pca', PCA())])
pca_pipeline.fit_transform(X)
pipeline = Pipeline([
    ('pca', pca_pipeline),
    ('regressor', LinearRegression())
])
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
mse_cv = -cv_scores.mean()
plt.figure(figsize=(10, 6))
pca = pca_pipeline.named_steps['pca']
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio by Number of Components')
plt.grid(True)
plt.show()
No description has been provided for this image
In [72]:
linear_pipeline_pca = Pipeline([
    ('pca', Pipeline([('preprocessor', preprocessor), ('pca', PCA(n_components=20))])), 
    ('regressor', LinearRegression())
])
linear_pipeline_pca.fit(X_train, y_train)
y_pred = linear_pipeline_pca.predict(X_test)
mse_pca = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_pca)

r2_pca = r2_score(y_test, y_pred)
print("R2 Score: ", r2_pca)
Mean Squared Error: 0.22747886811543103
R2 Score:  0.5525690573895108
In [73]:
svr_model_pca = SVR(kernel='rbf')
svr_pipeline_pca = Pipeline(steps=[('pca', Pipeline([('preprocessor', preprocessor), ('pca', PCA(n_components=20))])),
                                   ('regressor', svr_model)])

# Fit the pipeline and evaluate
svr_pipeline_pca.fit(X_train, y_train)
y_pred = svr_pipeline_pca.predict(X_test)
mse_svr_pca = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_svr_pca)

r2_svr_pca = r2_score(y_test, y_pred)
print("R2 Score: ", r2_svr_pca)
Mean Squared Error: 0.2080072706676499
R2 Score:  0.5908679783942161
In [45]:
amenities_text = df['amenities_clean'].apply(lambda x: x.replace('translation missing: en.hosting_amenity_49', '')
                                             .replace('translation missing: en.hosting_amenity_50', '')
                                             .split(','))
amenities_text = amenities_text.apply(lambda x: list(filter(None, x)))
amenities_text
Out[45]:
0        [Wireless Internet, Air conditioning, Kitchen,...
1        [Wireless Internet, Air conditioning, Kitchen,...
2        [TV, Cable TV, Wireless Internet, Air conditio...
3        [TV, Cable TV, Internet, Wireless Internet, Ki...
4        [TV, Internet, Wireless Internet, Air conditio...
                               ...                        
74106                                                   []
74107    [TV, Cable TV, Internet, Wireless Internet, Ki...
74108    [TV, Internet, Wireless Internet, Air conditio...
74109    [TV, Wireless Internet, Air conditioning, Kitc...
74110    [TV, Internet, Wireless Internet, Kitchen, Fre...
Name: amenities_clean, Length: 74111, dtype: object
In [46]:
mlb = MultiLabelBinarizer()
amenities_encoded = mlb.fit_transform(amenities_text)
amenities_df = pd.DataFrame(amenities_encoded, columns=mlb.classes_)
amenities_df = cleaned_df[features + ['price']].join(amenities_df)
amenities_df.head()
Out[46]:
accommodates bedrooms beds bathrooms property_type room_type cancellation_policy cleaning_fee city host_has_profile_pic host_identity_verified instant_bookable price smooth pathway to front door 24-hour check-in Accessible-height bed Accessible-height toilet Air conditioning Air purifier BBQ grill Baby bath Baby monitor Babysitter recommendations Bath towel Bathtub Bathtub with shower chair Beach essentials Beachfront Bed linens Body soap Breakfast Buzzer/wireless intercom Cable TV Carbon monoxide detector Cat(s) Changing table Children’s books and toys Children’s dinnerware Cleaning before checkout Coffee maker Cooking basics Crib Disabled parking spot Dishes and silverware Dishwasher Dog(s) Doorman Doorman Entry Dryer EV charger ... Long term stays allowed Luggage dropoff allowed Microwave Other Other pet(s) Outlet covers Oven Pack ’n Play/travel crib Paid parking off premises Path to entrance lit at night Patio or balcony Pets allowed Pets live on this property Pocket wifi Pool Private bathroom Private entrance Private living room Refrigerator Roll-in shower with chair Room-darkening shades Safety card Self Check-In Shampoo Single level home Ski in/Ski out Smart lock Smartlock Smoke detector Smoking allowed Stair gates Step-free access Stove Suitable for events TV Table corner guards Toilet paper Washer Washer / Dryer Waterfront Well-lit path to entrance Wheelchair accessible Wide clearance to bed Wide clearance to shower & toilet Wide clearance to shower and toilet Wide doorway Wide entryway Wide hallway clearance Window guards Wireless Internet
0 3 1.0 1.0 1.0 Apartment Entire home/apt strict 1 NYC 1.0 1.0 0 150.0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
1 7 3.0 3.0 1.0 Apartment Entire home/apt strict 1 NYC 1.0 0.0 1 169.0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
2 5 1.0 3.0 1.0 Apartment Entire home/apt moderate 1 NYC 1.0 1.0 1 145.0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
3 4 2.0 2.0 1.0 House Entire home/apt flexible 1 SF 1.0 1.0 0 750.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
4 2 0.0 1.0 1.0 Apartment Entire home/apt moderate 1 DC 1.0 1.0 1 115.0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1

5 rows × 141 columns

In [51]:
from xgboost import XGBClassifier
In [52]:
X = amenities_df.drop(list(mlb.classes_), axis=1)
y = amenities_df[list(mlb.classes_)]
# y_flattened = y.values.argmax(axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop=None, sparse_output=False), ['property_type', 'city']),
        ('ord', ordinal_encoder, ['room_type', 'cancellation_policy']),
        ('scaler', StandardScaler(), ['accommodates','bedrooms','beds','bathrooms'])
    ],
    remainder='passthrough')

multi_output_classifier = MultiOutputClassifier(XGBClassifier(objective='binary:logistic'))
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', multi_output_classifier)
])

rf_pipeline.fit(X_train, y_train)
predicted_amenities = rf_pipeline.predict(X_test)
predicted_amenities_labels = mlb.inverse_transform(predicted_amenities)
predicted_amenities_labels[:5]
Out[52]:
[('24-hour check-in',
  'Air conditioning',
  'Buzzer/wireless intercom',
  'Cable TV',
  'Carbon monoxide detector',
  'Dryer',
  'Essentials',
  'Family/kid friendly',
  'Fire extinguisher',
  'First aid kit',
  'Free parking on premises',
  'Hair dryer',
  'Hangers',
  'Heating',
  'Internet',
  'Iron',
  'Kitchen',
  'Laptop friendly workspace',
  'Self Check-In',
  'Shampoo',
  'Smoke detector',
  'TV',
  'Washer',
  'Wireless Internet'),
 ('Cable TV',
  'Carbon monoxide detector',
  'Dryer',
  'Essentials',
  'Fire extinguisher',
  'Hair dryer',
  'Hangers',
  'Heating',
  'Internet',
  'Kitchen',
  'Laptop friendly workspace',
  'Shampoo',
  'Smoke detector',
  'TV',
  'Washer',
  'Wireless Internet'),
 ('Air conditioning',
  'Carbon monoxide detector',
  'Essentials',
  'Hair dryer',
  'Hangers',
  'Heating',
  'Internet',
  'Iron',
  'Kitchen',
  'Laptop friendly workspace',
  'Shampoo',
  'Smoke detector',
  'TV',
  'Wireless Internet'),
 ('Air conditioning',
  'Essentials',
  'Hangers',
  'Heating',
  'Kitchen',
  'Smoke detector',
  'TV',
  'Wireless Internet'),
 ('Carbon monoxide detector',
  'Dryer',
  'Essentials',
  'Fire extinguisher',
  'Hair dryer',
  'Heating',
  'Internet',
  'Kitchen',
  'Laptop friendly workspace',
  'Shampoo',
  'Smoke detector',
  'TV',
  'Washer',
  'Wireless Internet')]
In [53]:
average_f1_score = f1_score(y_test, predicted_amenities, average='micro')
print(f"Average F1 Score: {average_f1_score}")
Average F1 Score: 0.7217858015168374
In [59]:
pca_pipeline = Pipeline([('preprocessor', preprocessor), ('pca', PCA())])
pca_pipeline.fit_transform(X)
pipeline = Pipeline([
    ('pca', pca_pipeline),
    ('classifier', multi_output_classifier)
])
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
mse_cv = -cv_scores.mean()
plt.figure(figsize=(10, 6))
pca = pca_pipeline.named_steps['pca']
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio by Number of Components')
plt.grid(True)
plt.show()
No description has been provided for this image
In [54]:
rf_pipeline_pca = Pipeline([
    ('pca', Pipeline([('preprocessor', preprocessor), ('pca', PCA(n_components=20))])),
    ('classifier', multi_output_classifier)
])

rf_pipeline_pca.fit(X_train, y_train)
Out[54]:
Pipeline(steps=[('pca',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('encoder',
                                                                   OneHotEncoder(sparse_output=False),
                                                                   ['property_type',
                                                                    'city']),
                                                                  ('ord',
                                                                   OrdinalEncoder(categories=[['Shared '
                                                                                               'room',
                                                                                               'Private '
                                                                                               'room',
                                                                                               'Entire '
                                                                                               'home/apt'],
                                                                                              ['flexible',
                                                                                               'moderate',
                                                                                               'strict',
                                                                                               'super_strict_30',
                                                                                               'super_strict_60']]),
                                                                   ['room_typ...
                                                               grow_policy=None,
                                                               importance_type=None,
                                                               interaction_constraints=None,
                                                               learning_rate=None,
                                                               max_bin=None,
                                                               max_cat_threshold=None,
                                                               max_cat_to_onehot=None,
                                                               max_delta_step=None,
                                                               max_depth=None,
                                                               max_leaves=None,
                                                               min_child_weight=None,
                                                               missing=nan,
                                                               monotone_constraints=None,
                                                               multi_strategy=None,
                                                               n_estimators=None,
                                                               n_jobs=None,
                                                               num_parallel_tree=None,
                                                               random_state=None, ...)))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('pca',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('encoder',
                                                                   OneHotEncoder(sparse_output=False),
                                                                   ['property_type',
                                                                    'city']),
                                                                  ('ord',
                                                                   OrdinalEncoder(categories=[['Shared '
                                                                                               'room',
                                                                                               'Private '
                                                                                               'room',
                                                                                               'Entire '
                                                                                               'home/apt'],
                                                                                              ['flexible',
                                                                                               'moderate',
                                                                                               'strict',
                                                                                               'super_strict_30',
                                                                                               'super_strict_60']]),
                                                                   ['room_typ...
                                                               grow_policy=None,
                                                               importance_type=None,
                                                               interaction_constraints=None,
                                                               learning_rate=None,
                                                               max_bin=None,
                                                               max_cat_threshold=None,
                                                               max_cat_to_onehot=None,
                                                               max_delta_step=None,
                                                               max_depth=None,
                                                               max_leaves=None,
                                                               min_child_weight=None,
                                                               missing=nan,
                                                               monotone_constraints=None,
                                                               multi_strategy=None,
                                                               n_estimators=None,
                                                               n_jobs=None,
                                                               num_parallel_tree=None,
                                                               random_state=None, ...)))])
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encoder',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['property_type', 'city']),
                                                 ('ord',
                                                  OrdinalEncoder(categories=[['Shared '
                                                                              'room',
                                                                              'Private '
                                                                              'room',
                                                                              'Entire '
                                                                              'home/apt'],
                                                                             ['flexible',
                                                                              'moderate',
                                                                              'strict',
                                                                              'super_strict_30',
                                                                              'super_strict_60']]),
                                                  ['room_type',
                                                   'cancellation_policy']),
                                                 ('scaler', StandardScaler(),
                                                  ['accommodates', 'bedrooms',
                                                   'beds', 'bathrooms'])])),
                ('pca', PCA(n_components=20))])
ColumnTransformer(remainder='passthrough',
                  transformers=[('encoder', OneHotEncoder(sparse_output=False),
                                 ['property_type', 'city']),
                                ('ord',
                                 OrdinalEncoder(categories=[['Shared room',
                                                             'Private room',
                                                             'Entire home/apt'],
                                                            ['flexible',
                                                             'moderate',
                                                             'strict',
                                                             'super_strict_30',
                                                             'super_strict_60']]),
                                 ['room_type', 'cancellation_policy']),
                                ('scaler', StandardScaler(),
                                 ['accommodates', 'bedrooms', 'beds',
                                  'bathrooms'])])
['property_type', 'city']
OneHotEncoder(sparse_output=False)
['room_type', 'cancellation_policy']
OrdinalEncoder(categories=[['Shared room', 'Private room', 'Entire home/apt'],
                           ['flexible', 'moderate', 'strict', 'super_strict_30',
                            'super_strict_60']])
['accommodates', 'bedrooms', 'beds', 'bathrooms']
StandardScaler()
['cleaning_fee', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable', 'price']
passthrough
PCA(n_components=20)
MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None, gamma=None,
                                              grow_policy=None,
                                              importance_type=None,
                                              interaction_constraints=None,
                                              learning_rate=None, max_bin=None,
                                              max_cat_threshold=None,
                                              max_cat_to_onehot=None,
                                              max_delta_step=None,
                                              max_depth=None, max_leaves=None,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                              multi_strategy=None,
                                              n_estimators=None, n_jobs=None,
                                              num_parallel_tree=None,
                                              random_state=None, ...))
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
In [74]:
cat_cols = ['property_type','room_type','bed_type','cancellation_policy','city']
data_json = {}
for col in cat_cols:
    values = cleaned_df[col].unique()
    data_json[col] = values.tolist()
with open('obj/cat_data.json', 'w', encoding='utf-8') as f:
    json.dump(data_json, f, ensure_ascii=False, indent=4)

with open('obj/price_predictor.pkl', 'wb') as file:
    pickle.dump(svr_pipeline_pca, file)

with open('obj/label_binarizer.pkl', 'wb') as file:  
    pickle.dump(mlb, file)

with open('obj/amenities_predictor.pkl', 'wb') as f:
    pickle.dump(rf_pipeline_pca, f)
In [58]:
# import shutil

# shutil.copytree('obj/', 'bin/')